Install packages

# if (!requireNamespace("BiocManager", quietly = TRUE))
#     install.packages("BiocManager")
# BiocManager::install()
# BiocManager::install("biomaRt")
# install.packages("tidyverse")
# install.packages("readxl")
# BiocManager::install("ComplexHeatmap")
# install.packages("matrixStats")
# install.packages("pheatmap")
# install.packages("RVAideMemoire")
# install.packages("dendextend")
# install.packages("binom")
# BiocManager::install("DESeq2")
# install.packages("corrplot")
# BiocManager::install("apeglm")
# install.packages("ashr")

Load packages

Load necessary packages

## Loading required package: S4Vectors
## Warning: package 'S4Vectors' was built under R version 4.1.1
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind, colnames,
##     dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
##     grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
##     order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
##     rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
##     union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.1.1
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Warning: package 'MatrixGenerics' was built under R version 4.1.1
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## corrplot 0.92 loaded
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse()   masks IRanges::collapse()
## x dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count()      masks matrixStats::count()
## x dplyr::desc()       masks IRanges::desc()
## x tidyr::expand()     masks S4Vectors::expand()
## x dplyr::filter()     masks stats::filter()
## x dplyr::first()      masks S4Vectors::first()
## x dplyr::lag()        masks stats::lag()
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename()     masks S4Vectors::rename()
## x dplyr::slice()      masks IRanges::slice()
## Warning: multiple methods tables found for 'union'
## Warning: multiple methods tables found for 'intersect'
## 
## Attaching package: 'InterMineR'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, union
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     intersect, union
## The following objects are masked from 'package:S4Vectors':
## 
##     intersect, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     intersect, union
## The following objects are masked from 'package:base':
## 
##     intersect, union

Data input and processing

Read in the counts data

countsData <- read.delim(file = "../01_input/RWP27_Embryo_Intestine_FACS_200923_all.counts", sep = " ")
head(countsData)
##                  chr start stop strand length embryoCells_rep1
## WBGene00014450 MtDNA     1   55      +     55                0
## WBGene00014451 MtDNA    58  111      +     54                0
## WBGene00010957 MtDNA   113  549      +    437                0
## WBGene00010958 MtDNA   549  783      +    235                0
## WBGene00014452 MtDNA   785  840      +     56                0
## WBGene00014453 MtDNA   842  896      +     55                0
##                embryoGFPminus_rep2 embryoGFPplus_rep2 embryoWhole_rep3
## WBGene00014450                   0                  0                0
## WBGene00014451                   0                  0                0
## WBGene00010957                   0                  0                0
## WBGene00010958                   0                  0                0
## WBGene00014452                   0                  0                0
## WBGene00014453                   0                  0                0
##                embryoCells_rep2 embryoGFPminus_rep3 embryoGFPplus_rep3
## WBGene00014450                0                   0                  0
## WBGene00014451                0                   0                  0
## WBGene00010957                0                   0                  0
## WBGene00010958                0                   0                  0
## WBGene00014452                0                   0                  0
## WBGene00014453                0                   0                  0
##                embryoGFPminus_rep1 embryoGFPplus_rep1 embryoWhole_rep2
## WBGene00014450                   0                  0                0
## WBGene00014451                   0                  0                0
## WBGene00010957                   0                  0                0
## WBGene00010958                   0                  0                0
## WBGene00014452                   0                  0                0
## WBGene00014453                   0                  0                0
colnames(countsData[6:15])
##  [1] "embryoCells_rep1"    "embryoGFPminus_rep2" "embryoGFPplus_rep2" 
##  [4] "embryoWhole_rep3"    "embryoCells_rep2"    "embryoGFPminus_rep3"
##  [7] "embryoGFPplus_rep3"  "embryoGFPminus_rep1" "embryoGFPplus_rep1" 
## [10] "embryoWhole_rep2"

Read in and print metadata file

metadata1 <- read.table(file = "../01_input/RWP27_metadata.tsv", header = FALSE, stringsAsFactors = FALSE)
colnames(metadata1) <- c("Filename.Fwd", "Filename.Rev", "names")

rep <- c(1,1,1,2,2,2,2,3,3,3)
type <- c("cells", "gut", "gutless", "whole", "cells", "gut", "gutless", "whole", "gut", "gutless")
metadata1 <- cbind(metadata1, rep, type)
metadata1
##            Filename.Fwd         Filename.Rev               names rep    type
## 1  RW57_S10_L003_R1_001 RW57_S10_L003_R2_001    embryoCells_rep1   1   cells
## 2  RW58_S11_L003_R1_001 RW58_S11_L003_R2_001  embryoGFPplus_rep1   1     gut
## 3  RW59_S12_L003_R1_001 RW59_S12_L003_R2_001 embryoGFPminus_rep1   1 gutless
## 4  RW60_S13_L003_R1_001 RW60_S13_L003_R2_001    embryoWhole_rep2   2   whole
## 5  RW61_S14_L003_R1_001 RW61_S14_L003_R2_001    embryoCells_rep2   2   cells
## 6  RW62_S15_L003_R1_001 RW62_S15_L003_R2_001  embryoGFPplus_rep2   2     gut
## 7  RW63_S16_L003_R1_001 RW63_S16_L003_R2_001 embryoGFPminus_rep2   2 gutless
## 8  RW64_S17_L003_R1_001 RW64_S17_L003_R2_001    embryoWhole_rep3   3   whole
## 9  RW65_S18_L003_R1_001 RW65_S18_L003_R2_001  embryoGFPplus_rep3   3     gut
## 10 RW66_S19_L003_R1_001 RW66_S19_L003_R2_001 embryoGFPminus_rep3   3 gutless

Factorize metadata

metadata1$names <- factor(
  metadata1$names,
  levels = metadata1$names
)
metadata1$type <-
  factor(metadata1$type, levels = c("gutless", "gut", "cells", "whole"))

Order columns according to metadata1 order

countsData <- countsData  %>% select(chr:length, sort(metadata1$names))
head(countsData)
##                  chr start stop strand length embryoCells_rep1
## WBGene00014450 MtDNA     1   55      +     55                0
## WBGene00014451 MtDNA    58  111      +     54                0
## WBGene00010957 MtDNA   113  549      +    437                0
## WBGene00010958 MtDNA   549  783      +    235                0
## WBGene00014452 MtDNA   785  840      +     56                0
## WBGene00014453 MtDNA   842  896      +     55                0
##                embryoGFPplus_rep1 embryoGFPminus_rep1 embryoWhole_rep2
## WBGene00014450                  0                   0                0
## WBGene00014451                  0                   0                0
## WBGene00010957                  0                   0                0
## WBGene00010958                  0                   0                0
## WBGene00014452                  0                   0                0
## WBGene00014453                  0                   0                0
##                embryoCells_rep2 embryoGFPplus_rep2 embryoGFPminus_rep2
## WBGene00014450                0                  0                   0
## WBGene00014451                0                  0                   0
## WBGene00010957                0                  0                   0
## WBGene00010958                0                  0                   0
## WBGene00014452                0                  0                   0
## WBGene00014453                0                  0                   0
##                embryoWhole_rep3 embryoGFPplus_rep3 embryoGFPminus_rep3
## WBGene00014450                0                  0                   0
## WBGene00014451                0                  0                   0
## WBGene00010957                0                  0                   0
## WBGene00010958                0                  0                   0
## WBGene00014452                0                  0                   0
## WBGene00014453                0                  0                   0

Generate a table called “cts” out of the countsData table. Subset the countsData.

cts <- as.matrix(countsData %>% select(metadata1$names))
head(cts)
##                embryoCells_rep1 embryoGFPplus_rep1 embryoGFPminus_rep1
## WBGene00014450                0                  0                   0
## WBGene00014451                0                  0                   0
## WBGene00010957                0                  0                   0
## WBGene00010958                0                  0                   0
## WBGene00014452                0                  0                   0
## WBGene00014453                0                  0                   0
##                embryoWhole_rep2 embryoCells_rep2 embryoGFPplus_rep2
## WBGene00014450                0                0                  0
## WBGene00014451                0                0                  0
## WBGene00010957                0                0                  0
## WBGene00010958                0                0                  0
## WBGene00014452                0                0                  0
## WBGene00014453                0                0                  0
##                embryoGFPminus_rep2 embryoWhole_rep3 embryoGFPplus_rep3
## WBGene00014450                   0                0                  0
## WBGene00014451                   0                0                  0
## WBGene00010957                   0                0                  0
## WBGene00010958                   0                0                  0
## WBGene00014452                   0                0                  0
## WBGene00014453                   0                0                  0
##                embryoGFPminus_rep3
## WBGene00014450                   0
## WBGene00014451                   0
## WBGene00010957                   0
## WBGene00010958                   0
## WBGene00014452                   0
## WBGene00014453                   0

Reorganize the metadata table so the names2 column are now headers

rownames(metadata1)<- metadata1$names
coldata <- metadata1[,c("names", "rep", "type")]
rownames(coldata) <- as.vector(metadata1$names)
coldata
##                                   names rep    type
## embryoCells_rep1       embryoCells_rep1   1   cells
## embryoGFPplus_rep1   embryoGFPplus_rep1   1     gut
## embryoGFPminus_rep1 embryoGFPminus_rep1   1 gutless
## embryoWhole_rep2       embryoWhole_rep2   2   whole
## embryoCells_rep2       embryoCells_rep2   2   cells
## embryoGFPplus_rep2   embryoGFPplus_rep2   2     gut
## embryoGFPminus_rep2 embryoGFPminus_rep2   2 gutless
## embryoWhole_rep3       embryoWhole_rep3   3   whole
## embryoGFPplus_rep3   embryoGFPplus_rep3   3     gut
## embryoGFPminus_rep3 embryoGFPminus_rep3   3 gutless

Check that the names match –> Should be TRUE

all(rownames(coldata) == colnames(cts))
## [1] TRUE

Contamination thresholding

Determine filtering threshold, identify read count threshold for non-intestine specific genes

tissue_specific_genes <- read_csv(file = "../01_input/tissue_specific_genes_220202.csv", show_col_types = FALSE)
head(tissue_specific_genes)
## # A tibble: 6 × 3
##   WBGeneID       Sequence.name tissue        
##   <chr>          <chr>         <chr>         
## 1 WBGene00000007 aat-6         intestine     
## 2 WBGene00000008 aat-7         intestine     
## 3 WBGene00000016 abf-5         nervous-system
## 4 WBGene00000031 abu-8         intestine     
## 5 WBGene00000034 abu-11        nervous-system
## 6 WBGene00000038 ace-4         nervous-system
cts_long <- as.data.frame(cts) %>% rownames_to_column(var = "WBGeneID") %>% 
  tidyr::pivot_longer(cols = embryoCells_rep1:embryoGFPminus_rep3, values_to = "reads") %>%
  tidyr::separate(name, sep = "_", into = c("sample_type", "rep")) 

cts_long_summary <- cts_long %>% 
  group_by(sample_type, WBGeneID) %>% 
  summarise(mean = mean(reads), variance = var(reads))
## `summarise()` has grouped output by 'sample_type'. You can override using the
## `.groups` argument.
cts_long_summary %>% ggplot(aes(x = log(mean), y = log(variance))) +
  geom_point(alpha = 0.01) +
  facet_wrap(~sample_type)

cts_long_summary %>% 
  left_join(tissue_specific_genes, by = "WBGeneID") %>%
  mutate(tissue = replace_na(tissue, "not_specific")) %>%
  ggplot(aes(x = log(mean), fill = sample_type)) +
  geom_density(kernel = "gaussian", alpha = 0.25) +
  facet_grid(~tissue)
## Warning: Removed 92003 rows containing non-finite values (stat_density).

Make DESeqDataSet

Generate the DESeqDataSet. The variables in this design formula will be the type of sample, and the preparation date. This should reduce the variability between the samples based on when they were made.

From the vignette: “In order to benefit from the default settings of the package, you should put the variable of interest at the end of the formula and make sure the control level is the first level.”

The variable of interest is the sample type.

Using DESeqDataSetFromMatrix since I used the program featureCounts.

dds <- DESeqDataSetFromMatrix(countData = cts,
                              colData = coldata,
                              design = ~ type)

Visualize read count distribution

hist(log(rowSums(counts(dds))))
abline(v = log(10), col = "red", lty = 2)

Filter genes with low read counts

keep <- rowSums(counts(dds)) >=10
dds <- dds[keep,]
dds
## class: DESeqDataSet 
## dim: 22754 10 
## metadata(1): version
## assays(1): counts
## rownames(22754): WBGene00021406 WBGene00021407 ... WBGene00199694
##   WBGene00044951
## rowData names(0):
## colnames(10): embryoCells_rep1 embryoGFPplus_rep1 ...
##   embryoGFPplus_rep3 embryoGFPminus_rep3
## colData names(3): names rep type

Perform Differential Expression

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
resultsNames(dds)
## [1] "Intercept"             "type_gut_vs_gutless"   "type_cells_vs_gutless"
## [4] "type_whole_vs_gutless"
res <- results(dds, contrast = c("type", "gut", "gutless"))
head(res)
## log2 fold change (MLE): type gut vs gutless 
## Wald test p-value: type gut vs gutless 
## DataFrame with 6 rows and 6 columns
##                 baseMean log2FoldChange     lfcSE      stat      pvalue
##                <numeric>      <numeric> <numeric> <numeric>   <numeric>
## WBGene00021406 114.30007       1.064520   0.64308  1.655346 9.78543e-02
## WBGene00021407  16.09971       0.559255   1.43107  0.390794 6.95949e-01
## WBGene00021408  19.52792       7.933028   1.53015  5.184491 2.16606e-07
## WBGene00021405   2.27976       4.557575   2.27585  2.002584 4.52219e-02
## WBGene00021409   2.40590       5.185917   3.06609  1.691378 9.07647e-02
## WBGene00021404  20.94007       8.493339   1.43841  5.904675 3.53343e-09
##                       padj
##                  <numeric>
## WBGene00021406 1.89076e-01
## WBGene00021407 7.89266e-01
## WBGene00021408 2.18657e-06
## WBGene00021405          NA
## WBGene00021409          NA
## WBGene00021404 4.30464e-08

Write results output file.

res_df <- as.data.frame(res)
# write.csv(x = res_df, "./200511_L1_intestine_FACS_gut_vs_gutless.csv", quote = FALSE)
ma_gut_vs_gutless <- plotMA(res, ylim = c(-11,11), alpha = 0.05)

# pdf(file = "./EmbryoFACS_MA_plot_gut_vs_gutless_200930.pdf", 5, 5)
# plotMA(res, ylim = c(-11,11), alpha = 0.05)
# dev.off()
resLFC <- lfcShrink(dds, coef = "type_gut_vs_gutless", type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resApeglm <- lfcShrink(dds, coef = "type_gut_vs_gutless", type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef = "type_gut_vs_gutless", type = "ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
par(mfrow = c(1,3), mar = c(4,4,2,1))
plotMA(resLFC, ylim=c(-10,15), main = "normal")
plotMA(resApeglm, ylim=c(-10,15), main = "apeglm")
plotMA(resAsh, ylim=c(-10,15), main = "ashr")

# write.csv(resApeglm %>% as.data.frame() %>% rownames_to_column(var = "WBGeneID"), file = "./200531_res_gut_vs_gutless_apeglmShrink.csv")

Export the plot

# pdf(file = "./200707_L1FACS_Gut_vs_Nongut_MAplot.pdf", height = 5, width = 5)
# plotMA(resApeglm, ylim=c(-10,10), main = "L1 FACS Gut vs Nongut Differential Expression")
# dev.off()

Export results

resApeglm.df <- as.data.frame(resApeglm) %>% rownames_to_column(var = "WBGeneID")
# write_csv(resApeglm.df, file = "../03_output/DE_Results_GFPplus-vs-GFPminus_apeglmShrink_220202.csv", col_names = TRUE)

rld <- rlog(dds)
rld.df <- as.data.frame(assay(rld)) %>% rownames_to_column(var = "WBGeneID")
# write_csv(rld.df, file = "../03_output/Embryo_Intestine_Rlog_Counts_220202.csv", col_names = TRUE)

Sample-to-sample distance matrix

vsd <- vst(dds, blind = FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- vsd$names
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows = sampleDists,
         clustering_distance_cols = sampleDists,
         col = colors)

Export the plot

# pdf(file = "../03_output/EmbryoFACS_Sample_Distance_Matrix_200930.pdf", height = 4, width = 6)
# pheatmap(sampleDistMatrix,
#          clustering_distance_rows = sampleDists,
#          clustering_distance_cols = sampleDists,
#          col = colors)
# dev.off()
plotDispEsts(dds)

select_rows <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]
df <- as.data.frame(colData(dds)[,c("type","rep")]) %>% mutate(type = fct_recode(type), type = fct_relevel(type, c("whole", "cells", "gut", "gutless"))) %>% arrange(type)
pheatmap(assay(rld)[select_rows,rownames(df)], cluster_rows=FALSE, show_rownames=FALSE,
         cluster_cols=FALSE, annotation_col=df)

Review automated filtering

metadata(res)$alpha
## [1] 0.1
metadata(res)$filterThreshold
## 13.57143% 
##  2.506608
plot(metadata(res)$filterNumRej,
     type = "b", ylab = "number of rejections",
     xlab = "quantiles of filter")
lines(metadata(res)$lo.fit, col="red")
abline(v=metadata(res)$filterTheta)

par(mfrow=c(2,2),mar=c(2,2,1,1))
ylim <- c(-7,12.5)
resGA <- results(dds, contrast = c("type", "gut", "gutless"), lfcThreshold=2, altHypothesis="greaterAbs")
resLA <- results(dds, contrast = c("type", "gut", "gutless"), lfcThreshold=2, altHypothesis="lessAbs")
resG <- results(dds,contrast = c("type", "gut", "gutless"), lfcThreshold=2, altHypothesis="greater")
resL <- results(dds,contrast = c("type", "gut", "gutless"), lfcThreshold=2, altHypothesis="less")
drawLines <- function() abline(h=c(-2,2),col="dodgerblue",lwd=2)
plotMA(resGA, ylim=ylim); drawLines()
plotMA(resLA, ylim=ylim); drawLines()
plotMA(resG, ylim=ylim); drawLines()
plotMA(resL, ylim=ylim); drawLines()

MA Plot

joined_ma <- plotMA(resApeglm, alpha = 0.01, returnData = TRUE) %>% 
  rownames_to_column(var = "WBGeneID") %>%  
  left_join(tissue_specific_genes, by = "WBGeneID") %>% 
  mutate(tissue = replace_na(tissue, "other-genes")) %>% 
  mutate(tissue = fct_recode(tissue), tissue = fct_relevel(tissue, c("intestine", "rectum", "excretory-system", "coelomic-system", "epithelial-system", "muscular-system", "nervous-system", "reproductive-system", "other-genes")))
## Warning: Unknown levels in `f`: muscular-system
tissue_genes_totals <- joined_ma %>% mutate(status = case_when(
                         lfc >= 2 & isDE == TRUE ~"enriched",
                         lfc <= 2 & isDE == TRUE ~ "depleted",
                         TRUE ~ "no_sig_diff")) %>% 
  group_by(tissue, status) %>% summarise(genes_per_tissue = n_distinct(WBGeneID)) %>% 
  mutate(mean = 1, lfc = 1, isDE = NA)
## `summarise()` has grouped output by 'tissue'. You can override using the
## `.groups` argument.
ma_tissue_facet <- joined_ma %>%
  ggplot(aes(x = log(mean), y = lfc, color = isDE == TRUE)) +
  geom_point(data = joined_ma %>% select(-tissue), alpha = 0.01, color = "lightgrey") +
  geom_point(alpha = 0.1) +
  scale_color_manual(values = c("black", "red"), name = "q.value < 0.01") +
  geom_hline(yintercept = c(-2,2), linetype = "dashed") +
  # annotate(geom = "text", label = "420", size = 4, x = 1, y = 1) +
  theme_linedraw() +
  facet_wrap(~tissue) +
  ylab("log2FoldChange(GFP+/GFP-)") +
  xlab("log10(mean normalized read counts")
  

ma_tissue_facet_labeled <- ma_tissue_facet + 
  geom_text(data = tissue_genes_totals %>% filter(status == "enriched"), 
                            mapping = aes(x = 6, y = 11,label = paste("enriched genes = ",genes_per_tissue)), 
                            color = "black",
                            size = 3) +
  geom_text(data = tissue_genes_totals %>% filter(status == "depleted"), 
                            mapping = aes(x = 6, y = -6,label = paste("depleted genes = ",genes_per_tissue)), 
                            color = "black",
                            size = 3) 
  

ma_tissue_facet_labeled

export the plot

ggsave(plot = ma_tissue_facet_labeled, file = "../03_output/Embryo_Intestine_MA_per_organ_system_220202.pdf",
       width = 8,
       height = 6)

ggsave(plot = ma_tissue_facet_labeled, file = "../03_output/Embryo_Intestine_MA_per_organ_system_220202.png",
       width = 8,
       height = 6,
       dpi = 700)

Histogram of log2FoldChange facet by tissue

joined_ma %>% filter(isDE == TRUE) %>%
  ggplot(aes(x = lfc)) +
  geom_density(data = joined_ma %>% select(-tissue), alpha = 0.1, color = "grey") +
  geom_density(kernel = "gaussian", alpha = 0.25, aes(fill = tissue)) +
  geom_vline(xintercept = c(-2,2), linetype = "dotted")+
  theme_classic() +
  facet_wrap(~tissue)

Overlap with published embryo intestine genes

im <- initInterMine(mine = listMines()["WormMine"], "m197OeDbhcudmcu143d2")
constraints = setConstraints(
  paths = "Gene.expressionClusters.primaryIdentifier",
  operators = "=",
  values = list("WBPaper00037950:intestine_embryo_enriched")
)
queryGeneIds = setQuery(
  select = c(
    "Gene.primaryIdentifier",
    "Gene.symbol",
    "Gene.expressionClusters.primaryIdentifier"
  ),
  where = constraints
)

spencer_embryo_genes <- runQuery(im = im, qry = queryGeneIds)
head(spencer_embryo_genes)
##   Gene.primaryIdentifier Gene.symbol Gene.expressionClusters.primaryIdentifier
## 1         WBGene00000005       aat-4 WBPaper00037950:intestine_embryo_enriched
## 2         WBGene00000040       aco-1 WBPaper00037950:intestine_embryo_enriched
## 3         WBGene00000067       act-5 WBPaper00037950:intestine_embryo_enriched
## 4         WBGene00000073       add-2 WBPaper00037950:intestine_embryo_enriched
## 5         WBGene00000098       air-1 WBPaper00037950:intestine_embryo_enriched
## 6         WBGene00000100       ajm-1 WBPaper00037950:intestine_embryo_enriched
spencer_ma <- plotMA(resApeglm, alpha = 0.01, returnData = TRUE) %>% rownames_to_column(var = "WBGeneID") %>% 
  mutate(spencer_embryo = WBGeneID %in% spencer_embryo_genes$Gene.primaryIdentifier)

spencer_ma %>% filter(spencer_embryo == TRUE) %>%
  ggplot(aes(x = log(mean), y = lfc, color = isDE)) +
  geom_point(data = spencer_ma %>% select(-spencer_embryo), alpha = 0.1, color = "grey") +
  geom_point(alpha = 0.25)

Tissue specific ground truth

Take a different approach to label a small set of known tissue specific genes

ground_truth <- readxl::read_xlsx(path = "../01_input/Tissue_Specific_Ground_Truth_RTPW.xlsx")
tissue_specific_markers <- read.csv(file = "../01_input/Tissue_Specific_Marker_Genes.tsv", header = TRUE, sep = "\t")

neuron_truth <- ground_truth %>% rowwise() %>% mutate(neuron_count = sum(c_across(ADA:VD_DD))) %>% select(WBGeneID, gene_name, neuron_count)
neuron_truth
## # A tibble: 161 × 3
## # Rowwise: 
##    WBGeneID       gene_name neuron_count
##    <chr>          <chr>            <dbl>
##  1 WBGene00000437 ceh-13               8
##  2 WBGene00003102 mab-5               12
##  3 WBGene00003024 lin-39              14
##  4 WBGene00001174 egl-5               12
##  5 WBGene00003779 nob-1                6
##  6 WBGene00004024 php-3                7
##  7 WBGene00006873 vab-7                6
##  8 WBGene00003912 pal-1                0
##  9 WBGene00000428 ceh-1                2
## 10 WBGene00000434 ceh-9                7
## # … with 151 more rows
ubiquitous_genes <- neuron_truth %>% filter(neuron_count == ground_truth %>% dplyr::select(ADA:VD_DD) %>% ncol()) 
ubiquitous_genes
## # A tibble: 27 × 3
## # Rowwise: 
##    WBGeneID       gene_name neuron_count
##    <chr>          <chr>            <dbl>
##  1 WBGene00000451 ceh-30             128
##  2 WBGene00000444 ceh-21             128
##  3 WBGene00000459 ceh-38             128
##  4 WBGene00000460 ceh-39             128
##  5 WBGene00000462 ceh-41             128
##  6 WBGene00015934 ceh-48             128
##  7 WBGene00000464 ceh-44             128
##  8 WBGene00007417 ceh-58             128
##  9 WBGene00013876 ceh-74             128
## 10 WBGene00018433 ceh-82             128
## # … with 17 more rows
neuron_truth %>% inner_join(tissue_specific_markers, by = c("WBGeneID" = "ExpressionPattern.genes.primaryIdentifier"))
## # A tibble: 36 × 14
## # Rowwise: 
##    WBGeneID       gene_name neuron_count ExpressionPattern.pat… ExpressionPatte…
##    <chr>          <chr>            <dbl> <chr>                  <chr>           
##  1 WBGene00001174 egl-5               12 gfp is expressed in a… Marker35        
##  2 WBGene00000584 cog-1               14 Marker for ASER neuro… Marker97        
##  3 WBGene00000584 cog-1               14 Marker for PDA cell.   Marker114       
##  4 WBGene00006818 unc-86              31 Expressed in HSN neur… Marker79        
##  5 WBGene00006818 unc-86              31 Expressed in HSN neur… Marker79        
##  6 WBGene00003000 lin-11              19 Expressed in secondar… Marker17        
##  7 WBGene00003000 lin-11              19 Expressed in secondar… Marker17        
##  8 WBGene00003000 lin-11              19 Marker for AIZ neuron… Marker103       
##  9 WBGene00003000 lin-11              19 Marker for AIZ neuron… Marker103       
## 10 WBGene00003000 lin-11              19 Marker for secondary … Marker77        
## # … with 26 more rows, and 9 more variables: ExpressionPattern.remark <chr>,
## #   ExpressionPattern.reporterGene <chr>,
## #   ExpressionPattern.subcellularLocalization <chr>,
## #   ExpressionPattern.anatomyTerms.name <chr>,
## #   ExpressionPattern.anatomyTerms.synonym <chr>,
## #   ExpressionPattern.anatomyTerms.definition <chr>,
## #   ExpressionPattern.anatomyTerms.primaryIdentifier <chr>, …
germline_genes <- data.frame(WBGeneID = c("WBGene00001598", "WBGene00003993", "WBGene00003992", "WBGene00010492"), gene_name = c("glh-1", "pgl-2", "pgl-1", "meg-1"))

germline_genes$tissue <- "germline"

ground_truth_markers <- ground_truth %>% mutate(tissue= case_when(
  gene_name == "elt-2" ~ "intestine",
  WBGeneID %in% (neuron_truth %>% inner_join(tissue_specific_markers, by = c("WBGeneID" = "ExpressionPattern.genes.primaryIdentifier")))$WBGeneID ~ "neuron",
  Intestine == 1 & Hypodermis == 0 & Muscle == 0 & Germline == 0 & Pharynx == 0 & Neuron == 0 ~ "intestine",
  Intestine == 0 & Hypodermis == 1 & Muscle == 0 & Germline == 0 & Pharynx == 0 & Neuron == 0 ~ "hypodermis",
  Intestine == 0 & Hypodermis == 0 & Muscle == 1 & Germline == 0 & Pharynx == 0 & Neuron == 0 ~ "muscle",
  Intestine == 0 & Hypodermis == 0 & Muscle == 0 & Germline == 1 & Pharynx == 0 & Neuron == 0 ~ "germline",
  Intestine == 0 & Hypodermis == 0 & Muscle == 0 & Germline == 0 & Pharynx == 1 & Neuron == 0 ~ "pharynx"
  )) %>% 
  dplyr::select(!(ADA:VD_DD)) %>%
  dplyr::select(!(Intestine:Neuron)) %>%
  drop_na(tissue) %>% 
  bind_rows(germline_genes)

ground_truth_markers$tissue <- factor(ground_truth_markers$tissue, levels = c("intestine", "hypodermis", "germline", "pharynx", "muscle", "neuron"))

tissues <- c("intestine", "hypodermis", "germline", "pharynx", "muscle", "neuron")
ordered_genes <- c()
for(i in tissues){
  genes <- (ground_truth_markers %>% filter(tissue == i))$gene_name
  ordered_genes <- append(ordered_genes, genes)
}
ordered_genes
##  [1] "ifb-2"  "act-5"  "elt-2"  "ges-1"  "lon-3"  "bli-1"  "glh-1"  "pgl-2" 
##  [9] "pgl-1"  "meg-1"  "myo-2"  "hlh-1"  "fhod-1" "egl-5"  "cog-1"  "unc-86"
## [17] "lin-11" "ttx-3"  "lim-6"  "unc-4"  "ceh-10" "ceh-36" "sng-1"  "rgef-1"
## [25] "myo-3"
fct_count(ground_truth_markers$tissue)
## # A tibble: 6 × 2
##   f              n
##   <fct>      <int>
## 1 intestine      4
## 2 hypodermis     2
## 3 germline       4
## 4 pharynx        1
## 5 muscle         2
## 6 neuron        12
joined_ma_markers <- plotMA(resApeglm, alpha = 0.01, returnData = TRUE) %>% rownames_to_column(var = "WBGeneID") %>%  left_join(ground_truth_markers, by = "WBGeneID") %>% drop_na(tissue)
  # mutate(tissue = replace_na(tissue, "not_specific"))

joined_ma_markers %>% #filter(isDE == TRUE) %>%
  ggplot(aes(x = log(mean), y = lfc, color = tissue)) +
  geom_point(data = joined_ma %>% select(-tissue), alpha = 0.1, color = "grey") +
  geom_point(alpha = 0.5) +
  geom_hline(yintercept = c(-2,2), linetype = "dotted")+
  theme_classic() +
  facet_wrap(~tissue)